{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np \n",
    "import matplotlib.pyplot as plt \n",
    "import seaborn as sns \n",
    "from tqdm import tqdm \n",
    "\n",
    "import csop_helper as csop\n",
    "import csop_pctile as csop_pctile \n",
    "import ast \n",
    "\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": "true"
   },
   "source": [
    "# Import data and define utilities: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_p1 = pd.read_pickle(\"./phase_1_processed.pkl\")\n",
    "\n",
    "df_p2 = pd.read_pickle(\"./rounds_data_phase2_processed.pkl\").query(\"group_formation == 'real-group'\").reset_index(drop=True)\n",
    "df_p2['set_workerIds'] = [set(ast.literal_eval(x)) for x in df_p2['workerIds']]\n",
    "\n",
    "df_players = pd.read_csv(\"players.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "def get_pctile_features(df, phase):\n",
    "    soln_pctile_lists = []\n",
    "\n",
    "    for row in tqdm(df.itertuples()):\n",
    "        soln_pctile_lists.append(csop_pctile.get_soln_pctiles(row.task_index, row.parsed_intermediateSolutions, phase=phase))\n",
    "        \n",
    "    df['soln_pctile'] = soln_pctile_lists\n",
    "    df['nominal_soln_pctile'] = [x[0] for x in df['soln_pctile'].values]\n",
    "    df['real_soln_pctile'] = [x[1] for x in df['soln_pctile'].values]\n",
    "\n",
    "    df['mean_nominal_pctile'] = [np.mean(x[1:]) for x in df['nominal_soln_pctile'].values]\n",
    "    df['mean_real_pctile'] = [np.mean(x[1:]) for x in df['real_soln_pctile'].values]\n",
    "    df['nominal_real_pctile_gap'] = df.apply(lambda x: np.array(x.nominal_soln_pctile[1:]) - np.array(x.real_soln_pctile[1:]), axis=1)\n",
    "    df['mean_nominal_real_pctile_gap'] = [np.mean(x) for x in df['nominal_real_pctile_gap'].values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "def get_n_female(set_workerIds):\n",
    "    return np.sum(df_players.query(\"workerId in @set_workerIds\")['gender'].values == 'female')\n",
    "\n",
    "def get_n_college(set_workerIds):\n",
    "    return np.sum([x in {\"bachelor\", \"master\"} for x in df_players.query(\"workerId in @set_workerIds\")['education'].values])\n",
    "\n",
    "def get_n_p1q1_exploration(set_workerIds):\n",
    "    return np.sum(df_players.query(\"workerId in @set_workerIds\")['step1_strategyQ1'].values == 'exploration')\n",
    "\n",
    "def get_n_p1q2_tolconflict(set_workerIds):\n",
    "    return np.sum(df_players.query(\"workerId in @set_workerIds\")['step1_strategyQ2'].values == 'tolerate-conflict')\n",
    "\n",
    "def get_n_p1q3_optimization(set_workerIds):\n",
    "    return np.sum(df_players.query(\"workerId in @set_workerIds\")['step1_strategyQ3'].values == 'optimization')\n",
    "\n",
    "def get_age_mean_std(set_workerIds):\n",
    "    return np.mean(df_players.query(\"workerId in @set_workerIds\")['age']), np.std(df_players.query(\"workerId in @set_workerIds\")['age'])\n",
    "\n",
    "def get_skill_mean_std(set_workerIds):\n",
    "    return np.mean(df_players.query(\"workerId in @set_workerIds\")['skill_CSOP']), np.std(df_players.query(\"workerId in @set_workerIds\")['skill_CSOP'])\n",
    "\n",
    "def get_social_mean_std(set_workerIds):\n",
    "    return np.mean(df_players.query(\"workerId in @set_workerIds\")['social_perceptiveness']), np.std(df_players.query(\"workerId in @set_workerIds\")['social_perceptiveness'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": "true"
   },
   "source": [
    "# Demographic measures "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_teams = df_p2.drop_duplicates(subset=[\"game_id\", \"workerIds\"])[['game_id', 'set_workerIds']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_teams['n_female'] = df_teams.apply(lambda x: get_n_female(x.set_workerIds), axis=1)\n",
    "df_teams['n_college'] = df_teams.apply(lambda x: get_n_college(x.set_workerIds), axis=1)\n",
    "df_teams['n_p1q1_exploration'] = df_teams.apply(lambda x: get_n_p1q1_exploration(x.set_workerIds), axis=1)\n",
    "df_teams['n_p1q2_tolconflict'] = df_teams.apply(lambda x: get_n_p1q2_tolconflict(x.set_workerIds), axis=1)\n",
    "df_teams['n_p1q3_optimization'] = df_teams.apply(lambda x: get_n_p1q3_optimization(x.set_workerIds), axis=1)\n",
    "df_teams[['age_mean','age_std']] = df_teams.apply(lambda x: get_age_mean_std(x.set_workerIds), axis=1, result_type=\"expand\")\n",
    "df_teams[['skill_mean','skill_std']] = df_teams.apply(lambda x: get_skill_mean_std(x.set_workerIds), axis=1, result_type=\"expand\")\n",
    "df_teams[['social_mean','social_std']] = df_teams.apply(lambda x: get_social_mean_std(x.set_workerIds), axis=1, result_type=\"expand\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": "true"
   },
   "source": [
    "# Phase 1 behavioral measures "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "def get_cogstyle_features(set_workerIds):\n",
    "    cogstyle_matrix = np.array(df_p1_avg.query(\"workerIds in @set_workerIds\").cogstyle_vector.values.tolist())\n",
    "    mean_gap, mean_int_solns_per_min, mean_num_postcomplete_solns = cogstyle_matrix.mean(axis=0)\n",
    "    \n",
    "    ab = cogstyle_matrix[1,:] - cogstyle_matrix[0,:]\n",
    "    ac = cogstyle_matrix[2,:] - cogstyle_matrix[0,:]\n",
    "    diversity = np.linalg.norm(np.cross(ab, ac))/2\n",
    "    \n",
    "    return mean_gap, mean_int_solns_per_min, mean_num_postcomplete_solns, diversity\n",
    "\n",
    "def get_p1_efficiency_mean_std(set_workerIds):\n",
    "    return np.mean(df_p1_efficiency.query(\"workerIds in @set_workerIds\")['cumulative_p1_efficiency']), np.std(df_p1_efficiency.query(\"workerIds in @set_workerIds\")['cumulative_p1_efficiency'])\n",
    "\n",
    "def get_p1_duration_mean_std(set_workerIds):\n",
    "    return np.mean(df_p1_efficiency.query(\"workerIds in @set_workerIds\")['cumulative_p1_duration']), np.std(df_p1_efficiency.query(\"workerIds in @set_workerIds\")['cumulative_p1_duration'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "6116it [00:20, 300.05it/s]\n",
      "/Users/mosobay/opt/anaconda3/envs/py38/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3419: RuntimeWarning: Mean of empty slice.\n",
      "  return _methods._mean(a, axis=axis, dtype=dtype,\n",
      "/Users/mosobay/opt/anaconda3/envs/py38/lib/python3.8/site-packages/numpy/core/_methods.py:188: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  ret = ret.dtype.type(ret / rcount)\n"
     ]
    }
   ],
   "source": [
    "get_pctile_features(df_p1, phase=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_p1['num_violations_added'] = df_p1.apply(lambda x: x.round_data.query(\"delta_violations > 0\")['delta_violations'].sum(), axis=1)\n",
    "df_p1['num_precomplete_solns'] = df_p1['ROUNDFEAT_SOLNS_index_first_complete'] + 1\n",
    "df_p1['num_postcomplete_solns'] = df_p1['ROUNDFEAT_SOLNS_num_inter_soln'] - df_p1['num_precomplete_solns']\n",
    "\n",
    "df_p1['mean_active_violations'] = df_p1.apply(lambda x: x.round_data['num_violations'].mean(), axis=1)\n",
    "df_p1['int_solns_per_min'] = df_p1['ROUNDFEAT_SOLNS_num_inter_soln'] / df_p1['round_duration']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "p1_col_list = [\"num_postcomplete_solns\", \"num_violations_added\", \"mean_active_violations\", \"int_solns_per_min\", \"score\"]\n",
    "for round_index in df_p1['round_index'].unique():\n",
    "    scaler = StandardScaler()\n",
    "    df_p1.loc[df_p1['round_index'] == round_index, [\"zscore_\"+x for x in p1_col_list]] = scaler.fit_transform(df_p1.loc[df_p1['round_index'] == round_index, p1_col_list])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "cog_style_features = ['mean_nominal_real_pctile_gap', 'zscore_int_solns_per_min', 'zscore_num_postcomplete_solns']\n",
    "df_p1_avg = df_p1.query(\"complexity_cat == 'Moderate'\").groupby(\"workerIds\")[cog_style_features].mean().reset_index()\n",
    "df_p1_avg['cogstyle_vector'] = [np.array(x) for x in df_p1_avg[cog_style_features].values.tolist()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_teams[['mean_nominal_real_gap', 'mean_zscore_int_solns_per_min', 'mean_zscore_num_postcomplete', 'cogstyle_diversity']] = df_teams.apply(lambda x: get_cogstyle_features(x.set_workerIds), axis=1, result_type=\"expand\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_p1_efficiency = (df_p1.query(\"complexity_cat == 'Moderate'\").groupby(\"workerIds\")[['score', 'round_duration']].sum()\n",
    "                    .assign(cumulative_p1_efficiency = lambda x: x.score / x.round_duration).reset_index()\n",
    "                    .rename(columns={'round_duration':'cumulative_p1_duration'}))[['workerIds', 'cumulative_p1_efficiency', 'cumulative_p1_duration']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_teams[['p1_efficiency_mean','p1_efficiency_std']] = df_teams.apply(lambda x: get_p1_efficiency_mean_std(x.set_workerIds), axis=1, result_type=\"expand\")\n",
    "df_teams[['p1_duration_mean','p1_duration_std']] = df_teams.apply(lambda x: get_p1_duration_mean_std(x.set_workerIds), axis=1, result_type=\"expand\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": "true"
   },
   "source": [
    "# Format outcome variables (Phase 2 performance) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_performance = pd.pivot_table(df_p2.groupby(['game_id', 'task_index'])[['normalized_score', 'round_duration']].sum().reset_index(),\n",
    "                                values=[\"normalized_score\", \"round_duration\"], index=\"game_id\", columns=\"task_index\")\n",
    "\n",
    "df_performance.columns = [\"p2_t{}_normscore\".format(x) for x in range(1,6)] + [\"p2_t{}_duration\".format(x) for x in range(1,6)]\n",
    "df_performance['p2_cumulative_normscore'] = df_performance.filter(regex=\"p2_t[0-9]_normscore\").sum(axis=1)\n",
    "df_performance['p2_cumulative_duration'] = df_performance.filter(regex=\"p2_t[0-9]_duration\").sum(axis=1)\n",
    "df_performance['p2_cumulative_efficiency'] = df_performance['p2_cumulative_normscore'] / df_performance['p2_cumulative_duration']\n",
    "\n",
    "for task_index in range(1,6):\n",
    "    df_performance['p2_t{}_efficiency'.format(task_index)] = df_performance['p2_t{}_normscore'.format(task_index)] / df_performance['p2_t{}_duration'.format(task_index)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": "true"
   },
   "source": [
    "# Final merge and export "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_export = (df_teams.merge(df_performance, on=\"game_id\", how=\"inner\")\n",
    "             .assign(p1q1_exploration_div = lambda x: (x.n_p1q1_exploration != 0) & (x.n_p1q1_exploration != 3))\n",
    "             .assign(p1q2_tolconflict_div = lambda x: (x.n_p1q2_tolconflict != 0) & (x.n_p1q2_tolconflict != 3))\n",
    "             .assign(p1q3_optimization_div = lambda x: (x.n_p1q3_optimization != 0) & (x.n_p1q3_optimization != 3)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(game_id                          0\n",
       " set_workerIds                    0\n",
       " n_female                         0\n",
       " n_college                        0\n",
       " n_p1q1_exploration               0\n",
       " n_p1q2_tolconflict               0\n",
       " n_p1q3_optimization              0\n",
       " age_mean                         0\n",
       " age_std                          0\n",
       " skill_mean                       0\n",
       " skill_std                        0\n",
       " social_mean                      0\n",
       " social_std                       0\n",
       " mean_nominal_real_gap            0\n",
       " mean_zscore_int_solns_per_min    0\n",
       " mean_zscore_num_postcomplete     0\n",
       " cogstyle_diversity               0\n",
       " p1_efficiency_mean               0\n",
       " p1_efficiency_std                0\n",
       " p1_duration_mean                 0\n",
       " p1_duration_std                  0\n",
       " p2_t1_normscore                  0\n",
       " p2_t2_normscore                  0\n",
       " p2_t3_normscore                  0\n",
       " p2_t4_normscore                  0\n",
       " p2_t5_normscore                  0\n",
       " p2_t1_duration                   0\n",
       " p2_t2_duration                   0\n",
       " p2_t3_duration                   0\n",
       " p2_t4_duration                   0\n",
       " p2_t5_duration                   0\n",
       " p2_cumulative_normscore          0\n",
       " p2_cumulative_duration           0\n",
       " p2_cumulative_efficiency         0\n",
       " p2_t1_efficiency                 0\n",
       " p2_t2_efficiency                 0\n",
       " p2_t3_efficiency                 0\n",
       " p2_t4_efficiency                 0\n",
       " p2_t5_efficiency                 0\n",
       " p1q1_exploration_div             0\n",
       " p1q2_tolconflict_div             0\n",
       " p1q3_optimization_div            0\n",
       " dtype: int64,\n",
       " (196, 42))"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Data completeness and size check \n",
    "df_export.isnull().sum(), df_export.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "heading_collapsed": "false"
   },
   "outputs": [],
   "source": [
    "df_export.to_pickle(\"prediction_features.pkl\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py38",
   "language": "python",
   "name": "py38"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
